{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#!pip install catboost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import datetime\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "#warnings.filterwarnings('ignore')\n",
    "#%matplotlib inline\n",
    "from sklearn.metrics import roc_auc_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "数据导入成功！\n"
     ]
    }
   ],
   "source": [
    "## 数据导入\n",
    "from sklearn.model_selection import train_test_split  \n",
    "from catboost import CatBoostClassifier\n",
    "train=pd.read_csv(\"data/data63592/train.csv\")\n",
    "testA=pd.read_csv(\"data/data63592/testA.csv\")\n",
    "print('数据导入成功！')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "numerical_fea = list(train.select_dtypes(exclude=['object']).columns)\n",
    "numerical_fea.remove('isDefault')\n",
    "numerical_fea.remove('n2')\n",
    "\n",
    "# from fancyimpute import KNN\n",
    "# train = KNN(k=5).fit_transform(train[numerical_fea])\n",
    "# testA = KNN(k=5).fit_transform(testA[numerical_fea])\n",
    "# train = pd.DataFrame(train)\n",
    "# testA = pd.DataFrame(testA)\n",
    "# print(train.head())\n",
    "# print(testA.head())\n",
    "\n",
    "train[numerical_fea] = train[numerical_fea].fillna(train[numerical_fea].median())\n",
    "testA[numerical_fea] = testA[numerical_fea].fillna(testA[numerical_fea].median())\n",
    "\n",
    "# train[numerical_fea] = train[numerical_fea].dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "数据预处理完成!\n"
     ]
    }
   ],
   "source": [
    "#数据预处理\n",
    "for data in [train]:\n",
    "    data['issueDate'] = pd.to_datetime(data['issueDate'],format='%Y-%m-%d')\n",
    "    data['grade'] = data['grade'].map({'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7})\n",
    "    data['employmentLength'] = data['employmentLength'].map({'1 year':1,'2 years':2,'3 years':3,'4 years':4,'5 years':5,'6 years':6,'7 years':7,'8 years':8,'9 years':9,'10+ years':10,'< 1 year':0})\n",
    "    data['subGrade'] = data['subGrade'].map({'E2':1,'D2':2,'D3':3,'A4':4,'C2':5,'A5':6,'C3':7,'B4':8,'B5':9,'E5':10,\n",
    "        'D4':11,'B3':12,'B2':13,'D1':14,'E1':15,'C5':16,'C1':17,'A2':18,'A3':19,'B1':20,\n",
    "        'E3':21,'F1':22,'C4':23,'A1':24,'D5':25,'F2':26,'E4':27,'F3':28,'G2':29,'F5':30,\n",
    "        'G3':31,'G1':32,'F4':33,'G4':34,'G5':35})\n",
    "    data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))\n",
    "  #  data['n15']=data['n8']*data['n10']\n",
    "    \n",
    "for data in [testA]:\n",
    "    data['issueDate'] = pd.to_datetime(data['issueDate'],format='%Y-%m-%d')\n",
    "    data['grade'] = data['grade'].map({'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7})\n",
    "    data['employmentLength'] = data['employmentLength'].map({'1 year':1,'2 years':2,'3 years':3,'4 years':4,'5 years':5,'6 years':6,'7 years':7,'8 years':8,'9 years':9,'10+ years':10,'< 1 year':0})\n",
    "    data['subGrade'] = data['subGrade'].map({'E2':1,'D2':2,'D3':3,'A4':4,'C2':5,'A5':6,'C3':7,'B4':8,'B5':9,'E5':10,\n",
    "        'D4':11,'B3':12,'B2':13,'D1':14,'E1':15,'C5':16,'C1':17,'A2':18,'A3':19,'B1':20,\n",
    "        'E3':21,'F1':22,'C4':23,'A1':24,'D5':25,'F2':26,'E4':27,'F3':28,'G2':29,'F5':30,\n",
    "        'G3':31,'G1':32,'F4':33,'G4':34,'G5':35})\n",
    "    data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))\n",
    "\n",
    "print(\"数据预处理完成!\")  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "sub=testA[['id']].copy()\n",
    "sub['isDefault']=0\n",
    "testA=testA.drop(['id','issueDate'],axis=1)\n",
    "data_x=train.drop(['isDefault','id','issueDate'],axis=1)\n",
    "data_y=train[['isDefault']].copy()\n",
    "x, val_x, y, val_y = train_test_split(  \n",
    "    data_x,  \n",
    "    data_y,  \n",
    "    test_size=0.2,  \n",
    "    random_state=1,  \n",
    "    stratify=data_y\n",
    ")  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "col=['grade','subGrade','employmentTitle','homeOwnership','verificationStatus','purpose','postCode','regionCode',\n",
    "     'initialListStatus','applicationType','policyCode']\n",
    "for i in data_x.columns:\n",
    "    if i in col:\n",
    "        data_x[i] = data_x[i].astype('str')\n",
    "for i in testA.columns:\n",
    "    if i in col:\n",
    "        testA[i] = testA[i].astype('str')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "model=CatBoostClassifier(\n",
    "            loss_function=\"Logloss\",\n",
    "            eval_metric=\"AUC\",\n",
    "            task_type=\"GPU\",\n",
    "            learning_rate=0.1,\n",
    "            iterations=500,\n",
    "            random_seed=2020,\n",
    "            od_type=\"Iter\",\n",
    "            depth=7)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0:\tlearn: 0.5608061\ttest: 0.5727825\tbest: 0.5727825 (0)\ttotal: 58.3ms\tremaining: 29.1s\n",
      "100:\tlearn: 0.7183899\ttest: 0.7184298\tbest: 0.7184298 (100)\ttotal: 5.16s\tremaining: 20.4s\n",
      "200:\tlearn: 0.7251087\ttest: 0.7227840\tbest: 0.7227840 (200)\ttotal: 10.4s\tremaining: 15.5s\n",
      "300:\tlearn: 0.7285709\ttest: 0.7244499\tbest: 0.7244499 (300)\ttotal: 15.7s\tremaining: 10.4s\n",
      "400:\tlearn: 0.7311564\ttest: 0.7251973\tbest: 0.7252351 (399)\ttotal: 21s\tremaining: 5.19s\n",
      "499:\tlearn: 0.7331063\ttest: 0.7255938\tbest: 0.7256294 (488)\ttotal: 26.4s\tremaining: 0us\n",
      "bestTest = 0.7256293595\n",
      "bestIteration = 488\n",
      "Shrink model to first 489 iterations.\n",
      "cat验证的auc:0.5356385371560265\n",
      "0:\tlearn: 0.5612352\ttest: 0.5564913\tbest: 0.5564913 (0)\ttotal: 56.9ms\tremaining: 28.4s\n",
      "100:\tlearn: 0.7188204\ttest: 0.7172583\tbest: 0.7172583 (100)\ttotal: 5.33s\tremaining: 21.1s\n",
      "200:\tlearn: 0.7259205\ttest: 0.7219132\tbest: 0.7219132 (200)\ttotal: 10.7s\tremaining: 15.9s\n",
      "300:\tlearn: 0.7292400\ttest: 0.7236129\tbest: 0.7236129 (300)\ttotal: 16.1s\tremaining: 10.6s\n",
      "400:\tlearn: 0.7317732\ttest: 0.7245195\tbest: 0.7245195 (400)\ttotal: 21.6s\tremaining: 5.33s\n",
      "499:\tlearn: 0.7338174\ttest: 0.7249820\tbest: 0.7249835 (498)\ttotal: 26.9s\tremaining: 0us\n",
      "bestTest = 0.7249834538\n",
      "bestIteration = 498\n",
      "Shrink model to first 499 iterations.\n",
      "cat验证的auc:0.5344156645216898\n",
      "0:\tlearn: 0.5609912\ttest: 0.5716429\tbest: 0.5716429 (0)\ttotal: 59ms\tremaining: 29.4s\n",
      "100:\tlearn: 0.7183969\ttest: 0.7178843\tbest: 0.7178843 (100)\ttotal: 5.15s\tremaining: 20.4s\n",
      "200:\tlearn: 0.7252267\ttest: 0.7227836\tbest: 0.7227836 (200)\ttotal: 10.5s\tremaining: 15.6s\n",
      "300:\tlearn: 0.7285742\ttest: 0.7245652\tbest: 0.7245754 (299)\ttotal: 16.1s\tremaining: 10.6s\n",
      "400:\tlearn: 0.7311484\ttest: 0.7256560\tbest: 0.7256560 (400)\ttotal: 21.6s\tremaining: 5.33s\n",
      "bestTest = 0.7261014283\n",
      "bestIteration = 449\n",
      "Shrink model to first 450 iterations.\n",
      "cat验证的auc:0.5344705571902124\n",
      "0:\tlearn: 0.5606796\ttest: 0.5732092\tbest: 0.5732092 (0)\ttotal: 57.5ms\tremaining: 28.7s\n",
      "100:\tlearn: 0.7185597\ttest: 0.7201983\tbest: 0.7201983 (100)\ttotal: 5.27s\tremaining: 20.8s\n",
      "200:\tlearn: 0.7252544\ttest: 0.7248911\tbest: 0.7248911 (200)\ttotal: 10.6s\tremaining: 15.7s\n",
      "300:\tlearn: 0.7284499\ttest: 0.7262987\tbest: 0.7262987 (300)\ttotal: 15.9s\tremaining: 10.5s\n",
      "400:\tlearn: 0.7311707\ttest: 0.7272457\tbest: 0.7272457 (400)\ttotal: 21.4s\tremaining: 5.28s\n",
      "499:\tlearn: 0.7332811\ttest: 0.7277634\tbest: 0.7277770 (493)\ttotal: 26.7s\tremaining: 0us\n",
      "bestTest = 0.7277769744\n",
      "bestIteration = 493\n",
      "Shrink model to first 494 iterations.\n",
      "cat验证的auc:0.5341294895658407\n",
      "0:\tlearn: 0.5619664\ttest: 0.5700183\tbest: 0.5700183 (0)\ttotal: 58.8ms\tremaining: 29.3s\n",
      "100:\tlearn: 0.7192999\ttest: 0.7166145\tbest: 0.7166145 (100)\ttotal: 5.17s\tremaining: 20.4s\n",
      "200:\tlearn: 0.7257937\ttest: 0.7210211\tbest: 0.7210234 (197)\ttotal: 10.5s\tremaining: 15.6s\n",
      "300:\tlearn: 0.7291164\ttest: 0.7227467\tbest: 0.7227552 (297)\ttotal: 15.8s\tremaining: 10.5s\n",
      "400:\tlearn: 0.7317339\ttest: 0.7236987\tbest: 0.7236987 (400)\ttotal: 21.2s\tremaining: 5.22s\n",
      "499:\tlearn: 0.7339801\ttest: 0.7242326\tbest: 0.7242326 (499)\ttotal: 26.6s\tremaining: 0us\n",
      "bestTest = 0.7242325544\n",
      "bestIteration = 499\n",
      "cat验证的auc:0.5345586363094307\n",
      "mean valAuc:0.53464257694864\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "answers = []\n",
    "mean_score = 0\n",
    "n_folds = 5\n",
    "sk = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2021)\n",
    "for train, test in sk.split(data_x, data_y):\n",
    "    x_train = data_x.iloc[train]\n",
    "    y_train = data_y.iloc[train]\n",
    "    x_test = data_x.iloc[test]\n",
    "    y_test = data_y.iloc[test]\n",
    "    clf = model.fit(x_train,y_train, eval_set=(x_test,y_test),verbose=100,cat_features=col)\n",
    "    yy_pred_valid=clf.predict(x_test)\n",
    "    print('cat验证的auc:{}'.format(roc_auc_score(y_test, yy_pred_valid)))\n",
    "    mean_score += roc_auc_score(y_test, yy_pred_valid) / n_folds\n",
    "    y_pred_valid = clf.predict(testA,prediction_type='Probability')[:,-1]\n",
    "    answers.append(y_pred_valid)\n",
    "print('mean valAuc:{}'.format(mean_score))\n",
    "cat_pre=sum(answers)/n_folds\n",
    "sub['isDefault']=cat_pre\n",
    "sub.to_csv('data/submit.csv',index=False) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "PaddlePaddle 1.8.4 (Python 3.5)",
   "language": "python",
   "name": "py35-paddle1.2.0"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
